(module
  (memory 1000)

  ;; Exported as "memory.copy": issues one memory.copy instruction whose
  ;; destination offset, source offset and byte count all come from the caller.
  (func (export "memory.copy")
    (param $dest i32) (param $source i32) (param $numBytes i32)
    (memory.copy
      (local.get $dest)
      (local.get $source)
      (local.get $numBytes))
  )
  
  ;; Exported as "memory.copy8": memory.copy with the byte count fixed to the
  ;; constant 8; only the two offsets are supplied by the caller.
  (func (export "memory.copy8")
    (param $dest i32) (param $source i32)
    (memory.copy
      (local.get $dest)
      (local.get $source)
      (i32.const 8))
  )

  ;; Exported as "memory.copy16": memory.copy with the byte count fixed to the
  ;; constant 16; only the two offsets are supplied by the caller.
  (func (export "memory.copy16")
    (param $dest i32) (param $source i32)
    (memory.copy
      (local.get $dest)
      (local.get $source)
      (i32.const 16))
  )
  
  ;; Exported as "memory.copy32": memory.copy with the byte count fixed to the
  ;; constant 32; only the two offsets are supplied by the caller.
  (func (export "memory.copy32")
    (param $dest i32) (param $source i32)
    (memory.copy
      (local.get $dest)
      (local.get $source)
      (i32.const 32))
  )
  
  ;; Exported as "memory.copy64": memory.copy with the byte count fixed to the
  ;; constant 64; only the two offsets are supplied by the caller.
  (func (export "memory.copy64")
    (param $dest i32) (param $source i32)
    (memory.copy
      (local.get $dest)
      (local.get $source)
      (i32.const 64))
  )
  
  ;; Exported as "memory.copy128": memory.copy with the byte count fixed to the
  ;; constant 128; only the two offsets are supplied by the caller.
  (func (export "memory.copy128")
    (param $dest i32) (param $source i32)
    (memory.copy
      (local.get $dest)
      (local.get $source)
      (i32.const 128))
  )
  
  ;; Exported as "memory.copy256": memory.copy with the byte count fixed to the
  ;; constant 256; only the two offsets are supplied by the caller.
  (func (export "memory.copy256")
    (param $dest i32) (param $source i32)
    (memory.copy
      (local.get $dest)
      (local.get $source)
      (i32.const 256))
  )

  ;; Hand-written copy of $numBytes bytes from $source to $dest that moves
  ;; eight bytes per iteration wherever possible.
  ;;
  ;; The direction is chosen so overlapping ranges are copied correctly:
  ;;   * $source <  $dest: walk from the highest offset down to offset 0.
  ;;   * $source >= $dest: walk upwards, advancing both pointers.
  ;;
  ;; The only exit test used to be "numBytes == 0" while stepping by 8, so a
  ;; count that is not a multiple of 8 stepped past zero and the loop never
  ;; finished normally. The leftover (numBytes & 7) bytes are now moved one at
  ;; a time; counts that are multiples of 8 copy exactly the same bytes as before.
  (func (export "i64 copy loop")
    (param $dest i32) (param $source i32) (param $numBytes i32)
    (if $exitCopyLoop (i32.lt_u (local.get $source) (local.get $dest))
      (then
        ;; Descending copy. Peel single bytes off the top end until the
        ;; remaining count is a multiple of 8; they sit at the highest
        ;; offsets, so they must be moved before the i64 chunks below them.
        loop $tailLoop
          (if (i32.and (local.get $numBytes) (i32.const 7))
            (then
              (local.set $numBytes (i32.sub (local.get $numBytes) (i32.const 1)))
              (i32.store8
                (i32.add (local.get $dest) (local.get $numBytes))
                (i32.load8_u (i32.add (local.get $source) (local.get $numBytes))))
              (br $tailLoop)))
        end
        ;; $numBytes is now a multiple of 8, so the equality test is exact.
        loop $copyLoop
          (br_if $exitCopyLoop (i32.eq (local.get $numBytes) (i32.const 0)))
          (local.set $numBytes (i32.sub (local.get $numBytes) (i32.const 8)))
          (i64.store
            (i32.add (local.get $dest) (local.get $numBytes))
            (i64.load (i32.add (local.get $source) (local.get $numBytes))))
          (br $copyLoop)
        end
      ) (else
        ;; Ascending copy: i64 chunks while at least 8 bytes remain.
        block $chunksDone
          loop $copyLoop
            (br_if $chunksDone (i32.lt_u (local.get $numBytes) (i32.const 8)))
            (i64.store
              (local.get $dest)
              (i64.load (local.get $source)))
            (local.set $numBytes (i32.sub (local.get $numBytes) (i32.const 8)))
            (local.set $dest (i32.add (local.get $dest) (i32.const 8)))
            (local.set $source (i32.add (local.get $source) (i32.const 8)))
            (br $copyLoop)
          end
        end
        ;; Fewer than 8 bytes remain; finish them one byte at a time.
        loop $tailLoop
          (br_if $exitCopyLoop (i32.eq (local.get $numBytes) (i32.const 0)))
          (i32.store8
            (local.get $dest)
            (i32.load8_u (local.get $source)))
          (local.set $numBytes (i32.sub (local.get $numBytes) (i32.const 1)))
          (local.set $dest (i32.add (local.get $dest) (i32.const 1)))
          (local.set $source (i32.add (local.get $source) (i32.const 1)))
          (br $tailLoop)
        end
      )
    )
  )

  ;; Hand-written copy of $numBytes bytes from $source to $dest, one byte per
  ;; iteration. When $source is below $dest the bytes are moved from the last
  ;; offset down to the first; otherwise they are moved from the first offset
  ;; upwards while both pointers advance.
  (func (export "i8 copy loop")
    (param $dest i32) (param $source i32) (param $numBytes i32)
    (block $done
      (if (i32.lt_u (local.get $source) (local.get $dest))
        (then
          (loop $descending
            (br_if $done (i32.eqz (local.get $numBytes)))
            ;; local.tee decrements the counter and feeds the new value into
            ;; the destination address; the source address reads it back.
            (i32.store8
              (i32.add
                (local.get $dest)
                (local.tee $numBytes (i32.sub (local.get $numBytes) (i32.const 1))))
              (i32.load8_u
                (i32.add (local.get $source) (local.get $numBytes))))
            (br $descending)))
        (else
          (loop $ascending
            (br_if $done (i32.eqz (local.get $numBytes)))
            (i32.store8
              (local.get $dest)
              (i32.load8_u (local.get $source)))
            (local.set $source (i32.add (local.get $source) (i32.const 1)))
            (local.set $dest (i32.add (local.get $dest) (i32.const 1)))
            (local.set $numBytes (i32.sub (local.get $numBytes) (i32.const 1)))
            (br $ascending)))))
  )
)

;; "memory.copy" export with a caller-supplied size, destination offset 0 and
;; source offset 8 (labelled "forward"). Sizes double from 8 bytes up to
;; 2097152 bytes; the two ranges overlap for every size above 8 bytes.
(benchmark "memory.copy (forward, 8B)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 8)))
(benchmark "memory.copy (forward, 16B)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 16)))
(benchmark "memory.copy (forward, 32B)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 32)))
(benchmark "memory.copy (forward, 64B)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 64)))
(benchmark "memory.copy (forward, 128B)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 128)))
(benchmark "memory.copy (forward, 256B)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 256)))
(benchmark "memory.copy (forward, 512B)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 512)))
(benchmark "memory.copy (forward, 1KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 1024)))
(benchmark "memory.copy (forward, 2KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 2048)))
(benchmark "memory.copy (forward, 4KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 4096)))
(benchmark "memory.copy (forward, 8KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 8192)))
(benchmark "memory.copy (forward, 16KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 16384)))
(benchmark "memory.copy (forward, 32KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 32768)))
(benchmark "memory.copy (forward, 64KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 65536)))
(benchmark "memory.copy (forward, 128KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 131072)))
(benchmark "memory.copy (forward, 256KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 262144)))
(benchmark "memory.copy (forward, 512KB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 524288)))
(benchmark "memory.copy (forward, 1MB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 1048576)))
(benchmark "memory.copy (forward, 2MB)" (invoke "memory.copy" (i32.const 0) (i32.const 8) (i32.const 2097152)))

;; "memory.copy" export with a caller-supplied size, destination offset 8 and
;; source offset 0 (labelled "reverse"). Sizes double from 8 bytes up to
;; 2097152 bytes; the two ranges overlap for every size above 8 bytes.
(benchmark "memory.copy (reverse, 8B)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 8)))
(benchmark "memory.copy (reverse, 16B)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 16)))
(benchmark "memory.copy (reverse, 32B)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 32)))
(benchmark "memory.copy (reverse, 64B)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 64)))
(benchmark "memory.copy (reverse, 128B)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 128)))
(benchmark "memory.copy (reverse, 256B)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 256)))
(benchmark "memory.copy (reverse, 512B)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 512)))
(benchmark "memory.copy (reverse, 1KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 1024)))
(benchmark "memory.copy (reverse, 2KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 2048)))
(benchmark "memory.copy (reverse, 4KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 4096)))
(benchmark "memory.copy (reverse, 8KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 8192)))
(benchmark "memory.copy (reverse, 16KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 16384)))
(benchmark "memory.copy (reverse, 32KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 32768)))
(benchmark "memory.copy (reverse, 64KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 65536)))
(benchmark "memory.copy (reverse, 128KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 131072)))
(benchmark "memory.copy (reverse, 256KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 262144)))
(benchmark "memory.copy (reverse, 512KB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 524288)))
(benchmark "memory.copy (reverse, 1MB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 1048576)))
(benchmark "memory.copy (reverse, 2MB)" (invoke "memory.copy" (i32.const 8) (i32.const 0) (i32.const 2097152)))

;; Constant-size exports "memory.copy8" through "memory.copy256", where the
;; byte count is an i32.const inside the function rather than an argument.
;; Destination offset 0, source offset 8 (labelled "forward").
(benchmark "memory.copy constant size (forward, 8B)" (invoke "memory.copy8" (i32.const 0) (i32.const 8)))
(benchmark "memory.copy constant size (forward, 16B)" (invoke "memory.copy16" (i32.const 0) (i32.const 8)))
(benchmark "memory.copy constant size (forward, 32B)" (invoke "memory.copy32" (i32.const 0) (i32.const 8)))
(benchmark "memory.copy constant size (forward, 64B)" (invoke "memory.copy64" (i32.const 0) (i32.const 8)))
(benchmark "memory.copy constant size (forward, 128B)" (invoke "memory.copy128" (i32.const 0) (i32.const 8)))
(benchmark "memory.copy constant size (forward, 256B)" (invoke "memory.copy256" (i32.const 0) (i32.const 8)))

;; Constant-size exports "memory.copy8" through "memory.copy256", where the
;; byte count is an i32.const inside the function rather than an argument.
;; Destination offset 8, source offset 0 (labelled "reverse").
(benchmark "memory.copy constant size (reverse, 8B)" (invoke "memory.copy8" (i32.const 8) (i32.const 0)))
(benchmark "memory.copy constant size (reverse, 16B)" (invoke "memory.copy16" (i32.const 8) (i32.const 0)))
(benchmark "memory.copy constant size (reverse, 32B)" (invoke "memory.copy32" (i32.const 8) (i32.const 0)))
(benchmark "memory.copy constant size (reverse, 64B)" (invoke "memory.copy64" (i32.const 8) (i32.const 0)))
(benchmark "memory.copy constant size (reverse, 128B)" (invoke "memory.copy128" (i32.const 8) (i32.const 0)))
(benchmark "memory.copy constant size (reverse, 256B)" (invoke "memory.copy256" (i32.const 8) (i32.const 0)))

;; "i64 copy loop" export with destination offset 0 and source offset 8
;; (labelled "forward"). The source is not below the destination, so the
;; function runs its ascending branch. Sizes double from 8 to 2097152 bytes,
;; all multiples of 8.
(benchmark "i64 copy loop (forward, 8B)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 8)))
(benchmark "i64 copy loop (forward, 16B)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 16)))
(benchmark "i64 copy loop (forward, 32B)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 32)))
(benchmark "i64 copy loop (forward, 64B)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 64)))
(benchmark "i64 copy loop (forward, 128B)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 128)))
(benchmark "i64 copy loop (forward, 256B)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 256)))
(benchmark "i64 copy loop (forward, 512B)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 512)))
(benchmark "i64 copy loop (forward, 1KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 1024)))
(benchmark "i64 copy loop (forward, 2KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 2048)))
(benchmark "i64 copy loop (forward, 4KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 4096)))
(benchmark "i64 copy loop (forward, 8KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 8192)))
(benchmark "i64 copy loop (forward, 16KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 16384)))
(benchmark "i64 copy loop (forward, 32KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 32768)))
(benchmark "i64 copy loop (forward, 64KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 65536)))
(benchmark "i64 copy loop (forward, 128KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 131072)))
(benchmark "i64 copy loop (forward, 256KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 262144)))
(benchmark "i64 copy loop (forward, 512KB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 524288)))
(benchmark "i64 copy loop (forward, 1MB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 1048576)))
(benchmark "i64 copy loop (forward, 2MB)" (invoke "i64 copy loop" (i32.const 0) (i32.const 8) (i32.const 2097152)))

;; "i64 copy loop" export with destination offset 8 and source offset 0
;; (labelled "reverse"). The source is below the destination, so the function
;; runs its descending branch. Sizes double from 8 to 2097152 bytes, all
;; multiples of 8.
(benchmark "i64 copy loop (reverse, 8B)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 8)))
(benchmark "i64 copy loop (reverse, 16B)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 16)))
(benchmark "i64 copy loop (reverse, 32B)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 32)))
(benchmark "i64 copy loop (reverse, 64B)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 64)))
(benchmark "i64 copy loop (reverse, 128B)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 128)))
(benchmark "i64 copy loop (reverse, 256B)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 256)))
(benchmark "i64 copy loop (reverse, 512B)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 512)))
(benchmark "i64 copy loop (reverse, 1KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 1024)))
(benchmark "i64 copy loop (reverse, 2KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 2048)))
(benchmark "i64 copy loop (reverse, 4KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 4096)))
(benchmark "i64 copy loop (reverse, 8KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 8192)))
(benchmark "i64 copy loop (reverse, 16KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 16384)))
(benchmark "i64 copy loop (reverse, 32KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 32768)))
(benchmark "i64 copy loop (reverse, 64KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 65536)))
(benchmark "i64 copy loop (reverse, 128KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 131072)))
(benchmark "i64 copy loop (reverse, 256KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 262144)))
(benchmark "i64 copy loop (reverse, 512KB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 524288)))
(benchmark "i64 copy loop (reverse, 1MB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 1048576)))
(benchmark "i64 copy loop (reverse, 2MB)" (invoke "i64 copy loop" (i32.const 8) (i32.const 0) (i32.const 2097152)))

;; "i8 copy loop" export with destination offset 0 and source offset 8
;; (labelled "forward"). The source is not below the destination, so the
;; function runs its ascending branch. Sizes double from 8 to 2097152 bytes.
(benchmark "i8 copy loop (forward, 8B)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 8)))
(benchmark "i8 copy loop (forward, 16B)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 16)))
(benchmark "i8 copy loop (forward, 32B)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 32)))
(benchmark "i8 copy loop (forward, 64B)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 64)))
(benchmark "i8 copy loop (forward, 128B)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 128)))
(benchmark "i8 copy loop (forward, 256B)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 256)))
(benchmark "i8 copy loop (forward, 512B)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 512)))
(benchmark "i8 copy loop (forward, 1KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 1024)))
(benchmark "i8 copy loop (forward, 2KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 2048)))
(benchmark "i8 copy loop (forward, 4KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 4096)))
(benchmark "i8 copy loop (forward, 8KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 8192)))
(benchmark "i8 copy loop (forward, 16KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 16384)))
(benchmark "i8 copy loop (forward, 32KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 32768)))
(benchmark "i8 copy loop (forward, 64KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 65536)))
(benchmark "i8 copy loop (forward, 128KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 131072)))
(benchmark "i8 copy loop (forward, 256KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 262144)))
(benchmark "i8 copy loop (forward, 512KB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 524288)))
(benchmark "i8 copy loop (forward, 1MB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 1048576)))
(benchmark "i8 copy loop (forward, 2MB)" (invoke "i8 copy loop" (i32.const 0) (i32.const 8) (i32.const 2097152)))
;; "i8 copy loop" export with destination offset 8 and source offset 0
;; (labelled "reverse"). The source is below the destination, so the function
;; runs its descending branch. Sizes double from 8 to 2097152 bytes.
(benchmark "i8 copy loop (reverse, 8B)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 8)))
(benchmark "i8 copy loop (reverse, 16B)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 16)))
(benchmark "i8 copy loop (reverse, 32B)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 32)))
(benchmark "i8 copy loop (reverse, 64B)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 64)))
(benchmark "i8 copy loop (reverse, 128B)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 128)))
(benchmark "i8 copy loop (reverse, 256B)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 256)))
(benchmark "i8 copy loop (reverse, 512B)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 512)))
(benchmark "i8 copy loop (reverse, 1KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 1024)))
(benchmark "i8 copy loop (reverse, 2KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 2048)))
(benchmark "i8 copy loop (reverse, 4KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 4096)))
(benchmark "i8 copy loop (reverse, 8KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 8192)))
(benchmark "i8 copy loop (reverse, 16KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 16384)))
(benchmark "i8 copy loop (reverse, 32KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 32768)))
(benchmark "i8 copy loop (reverse, 64KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 65536)))
(benchmark "i8 copy loop (reverse, 128KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 131072)))
(benchmark "i8 copy loop (reverse, 256KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 262144)))
(benchmark "i8 copy loop (reverse, 512KB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 524288)))
(benchmark "i8 copy loop (reverse, 1MB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 1048576)))
(benchmark "i8 copy loop (reverse, 2MB)" (invoke "i8 copy loop" (i32.const 8) (i32.const 0) (i32.const 2097152)))
